Load Libraries
import numpy as np
import pandas as pd
import plotly.express as px
# This ensures Plotly output works in multiple places:
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook"
Function to Load csv
# Function to Load csv
def load_csv(file):
csv = pd.read_csv(file, sep=",")
return csv
Source 1 - Malaria Dataset
Source 2 (Additional) - Country Mapping - ISO, Continent, Region
Load File 1 - ISO-3166 Country Code Information (Additional Dataset)
df_continents = load_csv("./data/additional/continents2.csv")
df_continents.head()
| name | alpha-2 | alpha-3 | country-code | iso_3166-2 | region | sub-region | intermediate-region | region-code | sub-region-code | intermediate-region-code | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | AF | AFG | 4 | ISO 3166-2:AF | Asia | Southern Asia | NaN | 142.0 | 34.0 | NaN |
| 1 | Åland Islands | AX | ALA | 248 | ISO 3166-2:AX | Europe | Northern Europe | NaN | 150.0 | 154.0 | NaN |
| 2 | Albania | AL | ALB | 8 | ISO 3166-2:AL | Europe | Southern Europe | NaN | 150.0 | 39.0 | NaN |
| 3 | Algeria | DZ | DZA | 12 | ISO 3166-2:DZ | Africa | Northern Africa | NaN | 2.0 | 15.0 | NaN |
| 4 | American Samoa | AS | ASM | 16 | ISO 3166-2:AS | Oceania | Polynesia | NaN | 9.0 | 61.0 | NaN |
Load File 2 - Malaria deaths by Age
df_m_deaths_age = load_csv("./data/malaria_deaths_age.txt")
df_m_deaths_age
| Unnamed: 0 | entity | code | year | age_group | deaths | |
|---|---|---|---|---|---|---|
| 0 | 1 | Afghanistan | AFG | 1990 | Under 5 | 184.606435 |
| 1 | 2 | Afghanistan | AFG | 1991 | Under 5 | 191.658193 |
| 2 | 3 | Afghanistan | AFG | 1992 | Under 5 | 197.140197 |
| 3 | 4 | Afghanistan | AFG | 1993 | Under 5 | 207.357753 |
| 4 | 5 | Afghanistan | AFG | 1994 | Under 5 | 226.209363 |
| ... | ... | ... | ... | ... | ... | ... |
| 30775 | 30776 | Zimbabwe | ZWE | 2012 | 50-69 | 103.185111 |
| 30776 | 30777 | Zimbabwe | ZWE | 2013 | 50-69 | 100.113293 |
| 30777 | 30778 | Zimbabwe | ZWE | 2014 | 50-69 | 99.013890 |
| 30778 | 30779 | Zimbabwe | ZWE | 2015 | 50-69 | 98.091738 |
| 30779 | 30780 | Zimbabwe | ZWE | 2016 | 50-69 | 97.402058 |
30780 rows × 6 columns
Load File 3 - Incidence of Malaria (per 1,000 population at risk)
df_m_inc = load_csv("./data/malaria_inc.txt")
df_m_inc.head()
| Entity | Code | Year | Incidence of malaria (per 1,000 population at risk) (per 1,000 population at risk) | |
|---|---|---|---|---|
| 0 | Afghanistan | AFG | 2000 | 107.100000 |
| 1 | Afghanistan | AFG | 2005 | 46.500000 |
| 2 | Afghanistan | AFG | 2010 | 23.900000 |
| 3 | Afghanistan | AFG | 2015 | 23.600000 |
| 4 | Algeria | DZA | 2000 | 0.037746 |
Load File 4 - Malaria Deaths (per 100,000 people)
df_m_deaths = load_csv("./data/malaria_deaths.txt")
df_m_deaths.head()
| Entity | Code | Year | Deaths - Malaria - Sex: Both - Age: Age-standardized (Rate) (per 100,000 people) | |
|---|---|---|---|---|
| 0 | Afghanistan | AFG | 1990 | 6.802930 |
| 1 | Afghanistan | AFG | 1991 | 6.973494 |
| 2 | Afghanistan | AFG | 1992 | 6.989882 |
| 3 | Afghanistan | AFG | 1993 | 7.088983 |
| 4 | Afghanistan | AFG | 1994 | 7.392472 |
# For File 2 - Malaria deaths by Age
df_m_deaths_age_joined = df_m_deaths_age.merge(df_continents,how="left",left_on="code",right_on="alpha-3")
df_m_deaths_age_joined.head()
| Unnamed: 0 | entity | code | year | age_group | deaths | name | alpha-2 | alpha-3 | country-code | iso_3166-2 | region | sub-region | intermediate-region | region-code | sub-region-code | intermediate-region-code | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Afghanistan | AFG | 1990 | Under 5 | 184.606435 | Afghanistan | AF | AFG | 4.0 | ISO 3166-2:AF | Asia | Southern Asia | NaN | 142.0 | 34.0 | NaN |
| 1 | 2 | Afghanistan | AFG | 1991 | Under 5 | 191.658193 | Afghanistan | AF | AFG | 4.0 | ISO 3166-2:AF | Asia | Southern Asia | NaN | 142.0 | 34.0 | NaN |
| 2 | 3 | Afghanistan | AFG | 1992 | Under 5 | 197.140197 | Afghanistan | AF | AFG | 4.0 | ISO 3166-2:AF | Asia | Southern Asia | NaN | 142.0 | 34.0 | NaN |
| 3 | 4 | Afghanistan | AFG | 1993 | Under 5 | 207.357753 | Afghanistan | AF | AFG | 4.0 | ISO 3166-2:AF | Asia | Southern Asia | NaN | 142.0 | 34.0 | NaN |
| 4 | 5 | Afghanistan | AFG | 1994 | Under 5 | 226.209363 | Afghanistan | AF | AFG | 4.0 | ISO 3166-2:AF | Asia | Southern Asia | NaN | 142.0 | 34.0 | NaN |
# For File 3 - Incidence of Malaria (per 1,000 population at risk)
df_m_inc_joined = df_m_inc.merge(df_continents,how="left",left_on="Code",right_on="alpha-3")
df_m_inc_joined.head()
| Entity | Code | Year | Incidence of malaria (per 1,000 population at risk) (per 1,000 population at risk) | name | alpha-2 | alpha-3 | country-code | iso_3166-2 | region | sub-region | intermediate-region | region-code | sub-region-code | intermediate-region-code | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | AFG | 2000 | 107.100000 | Afghanistan | AF | AFG | 4.0 | ISO 3166-2:AF | Asia | Southern Asia | NaN | 142.0 | 34.0 | NaN |
| 1 | Afghanistan | AFG | 2005 | 46.500000 | Afghanistan | AF | AFG | 4.0 | ISO 3166-2:AF | Asia | Southern Asia | NaN | 142.0 | 34.0 | NaN |
| 2 | Afghanistan | AFG | 2010 | 23.900000 | Afghanistan | AF | AFG | 4.0 | ISO 3166-2:AF | Asia | Southern Asia | NaN | 142.0 | 34.0 | NaN |
| 3 | Afghanistan | AFG | 2015 | 23.600000 | Afghanistan | AF | AFG | 4.0 | ISO 3166-2:AF | Asia | Southern Asia | NaN | 142.0 | 34.0 | NaN |
| 4 | Algeria | DZA | 2000 | 0.037746 | Algeria | DZ | DZA | 12.0 | ISO 3166-2:DZ | Africa | Northern Africa | NaN | 2.0 | 15.0 | NaN |
# For File 4 - Malaria Deaths (per 100,000 people)
df_m_deaths_joined = df_m_deaths.merge(df_continents,how="left",left_on="Code",right_on="alpha-3")
df_m_deaths_joined.head()
| Entity | Code | Year | Deaths - Malaria - Sex: Both - Age: Age-standardized (Rate) (per 100,000 people) | name | alpha-2 | alpha-3 | country-code | iso_3166-2 | region | sub-region | intermediate-region | region-code | sub-region-code | intermediate-region-code | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | AFG | 1990 | 6.802930 | Afghanistan | AF | AFG | 4.0 | ISO 3166-2:AF | Asia | Southern Asia | NaN | 142.0 | 34.0 | NaN |
| 1 | Afghanistan | AFG | 1991 | 6.973494 | Afghanistan | AF | AFG | 4.0 | ISO 3166-2:AF | Asia | Southern Asia | NaN | 142.0 | 34.0 | NaN |
| 2 | Afghanistan | AFG | 1992 | 6.989882 | Afghanistan | AF | AFG | 4.0 | ISO 3166-2:AF | Asia | Southern Asia | NaN | 142.0 | 34.0 | NaN |
| 3 | Afghanistan | AFG | 1993 | 7.088983 | Afghanistan | AF | AFG | 4.0 | ISO 3166-2:AF | Asia | Southern Asia | NaN | 142.0 | 34.0 | NaN |
| 4 | Afghanistan | AFG | 1994 | 7.392472 | Afghanistan | AF | AFG | 4.0 | ISO 3166-2:AF | Asia | Southern Asia | NaN | 142.0 | 34.0 | NaN |
print(f"Null rows after merge: {df_m_deaths_age_joined['alpha-3'].isnull().sum()}")
# To replace null rows with "Others"
df_m_deaths_age_joined[["alpha-3","region","sub-region"]] = df_m_deaths_age_joined[["alpha-3","region","sub-region"]].fillna("Others")
print(f"Null rows after clean-up: {df_m_deaths_age_joined['alpha-3'].isnull().sum()}")
Null rows after merge: 4455 Null rows after clean-up: 0
DataFrame = df_m_inc_joined
print(f"Null rows after merge: {df_m_inc_joined['alpha-3'].isnull().sum()}")
# To replace null rows with "Others"
df_m_inc_joined[["alpha-3","region","sub-region"]] = df_m_inc_joined[["alpha-3","region","sub-region"]].fillna("Others")
print(f"Null rows after clean-up: {df_m_inc_joined['alpha-3'].isnull().sum()}")
Null rows after merge: 112 Null rows after clean-up: 0
DataFrame = df_m_deaths_age_joined
print(f"Null rows after merge: {df_m_deaths_age_joined['alpha-3'].isnull().sum()}")
Null rows after merge: 0
df_m_deaths_joined_grouped_region = df_m_deaths_joined.groupby(["region","Code","Entity"])["Deaths - Malaria - Sex: Both - Age: Age-standardized (Rate) (per 100,000 people)"]
df_plot_overview = pd.DataFrame(df_m_deaths_joined_grouped_region.mean()) \
.reset_index() \
.rename({"Deaths - Malaria - Sex: Both - Age: Age-standardized (Rate) (per 100,000 people)":"average"},axis='columns')
# Filter only locations that are above the median value.
median_death = df_plot_overview["average"].median()
df_plot_overview = df_plot_overview[df_plot_overview["average"]>median_death]
df_plot_overview
| region | Code | Entity | average | |
|---|---|---|---|---|
| 0 | Africa | AGO | Angola | 32.126978 |
| 1 | Africa | BDI | Burundi | 131.296440 |
| 2 | Africa | BEN | Benin | 77.581757 |
| 3 | Africa | BFA | Burkina Faso | 169.667275 |
| 4 | Africa | BWA | Botswana | 1.338110 |
| ... | ... | ... | ... | ... |
| 185 | Oceania | GUM | Guam | 0.436020 |
| 188 | Oceania | MNP | Northern Mariana Islands | 0.262181 |
| 190 | Oceania | PNG | Papua New Guinea | 30.445500 |
| 191 | Oceania | SLB | Solomon Islands | 46.817369 |
| 193 | Oceania | VUT | Vanuatu | 13.443657 |
97 rows × 4 columns
fig = px.scatter_geo(df_plot_overview,
title="Overview: Average Malaria Deaths (per 1,000 population at risk) from " + str(df_m_deaths_joined["Year"].min()) + " to " + str(df_m_deaths_joined["Year"].max()),
locations="Code",
size="average",
locationmode = "ISO-3",
color = "average",
custom_data=[
'Entity',
'region'
],
hover_data = {'average':':.2f'},
hover_name = "Entity",
labels={
'average':'Average Malaria Deaths'
},
projection = 'natural earth'
)
fig.update_layout(
margin=dict(l=0, r=0, t=70, b=0),
)
fig.show()
# Preparing Data
df_m_deaths_joined_grouped = df_m_deaths_joined.groupby(["Year","sub-region"])["Deaths - Malaria - Sex: Both - Age: Age-standardized (Rate) (per 100,000 people)"]
df_plot1 = pd.DataFrame(df_m_deaths_joined_grouped.mean()) \
.reset_index() \
.rename({"Deaths - Malaria - Sex: Both - Age: Age-standardized (Rate) (per 100,000 people)":"average"},axis='columns') \
df_plot1
| Year | sub-region | average | |
|---|---|---|---|
| 0 | 1990 | Australia and New Zealand | 0.000000 |
| 1 | 1990 | Central Asia | 0.037717 |
| 2 | 1990 | Eastern Asia | 0.016959 |
| 3 | 1990 | Eastern Europe | 0.000000 |
| 4 | 1990 | Latin America and the Caribbean | 0.331001 |
| ... | ... | ... | ... |
| 454 | 2016 | Southern Asia | 2.263307 |
| 455 | 2016 | Southern Europe | 0.000000 |
| 456 | 2016 | Sub-Saharan Africa | 41.574655 |
| 457 | 2016 | Western Asia | 0.005493 |
| 458 | 2016 | Western Europe | 0.000000 |
459 rows × 3 columns
def plot1(df):
fig = px.line(df, x="Year", y="average",
color="sub-region",
title='Average Malaria Deaths by Region from Years ' + str(df["Year"].min()) + ' to ' + str(df["Year"].max()),
labels={
'sub-region':"Sub-region",
'average':'Average Malaria Deaths (per 1,000 population at risk)'
},
hover_data={
'average':':.2f'
}
)
fig.update_traces(mode="markers+lines")
fig.update_xaxes(showspikes=True)
fig.update_yaxes(showspikes=True)
fig.update_layout(
xaxis=dict(
rangeslider=dict(
visible=True
)
)
)
return fig
plot1(df_plot1).show()
df_plot2 = df_m_inc_joined[(df_m_inc_joined["sub-region"] == "Sub-Saharan Africa")] \
.rename({"Incidence of malaria (per 1,000 population at risk) (per 1,000 population at risk)":"incidence"},axis='columns')
# To find the average of the incendence of Malaria by "Intermediate-region"
df_plot2 = df_plot2.groupby(["intermediate-region","Year"])["incidence"].mean()
df_plot2 = df_plot2.reset_index()
df_plot2
| intermediate-region | Year | incidence | |
|---|---|---|---|
| 0 | Eastern Africa | 2000 | 316.325000 |
| 1 | Eastern Africa | 2005 | 214.468750 |
| 2 | Eastern Africa | 2010 | 172.206250 |
| 3 | Eastern Africa | 2015 | 134.337500 |
| 4 | Middle Africa | 2000 | 377.222222 |
| 5 | Middle Africa | 2005 | 352.611111 |
| 6 | Middle Africa | 2010 | 224.422222 |
| 7 | Middle Africa | 2015 | 191.722222 |
| 8 | Southern Africa | 2000 | 34.850000 |
| 9 | Southern Africa | 2005 | 17.375000 |
| 10 | Southern Africa | 2010 | 3.700000 |
| 11 | Southern Africa | 2015 | 4.850000 |
| 12 | Western Africa | 2000 | 408.562500 |
| 13 | Western Africa | 2005 | 363.975000 |
| 14 | Western Africa | 2010 | 331.825000 |
| 15 | Western Africa | 2015 | 263.500000 |
def plot2(df):
fig = px.line(df, x="Year", y="incidence",
color="intermediate-region",
title='Incidence of Malaria by Region from Years ' + str(df["Year"].min()) + ' to ' + str(df["Year"].max()) + " - Sub-Saharan Africa",
labels={'incidence':'Average Incidence of malaria (per 1,000 population at risk)'},
hover_data={
'incidence':':.2f'
}
)
fig.update_layout(
xaxis = dict(
tickmode = 'array',
tickvals = list(df["Year"].unique()),
ticktext = list(range(df["Year"].min(),df["Year"].max()+1,5)) # Noted that data is every 5 years
)
)
fig.update_traces(mode="markers+lines")
fig.update_xaxes(showspikes=True)
fig.update_yaxes(showspikes=True)
return fig
plot2(df_plot2).show()
df_plot3 = df_m_deaths_age_joined[df_m_deaths_age_joined["intermediate-region"]=="Western Africa"]
df_plot3 = df_plot3.groupby(["year","age_group"])["deaths"].mean().reset_index()
df_plot3
| year | age_group | deaths | |
|---|---|---|---|
| 0 | 1990 | 15-49 | 655.579646 |
| 1 | 1990 | 5-14 | 1323.488103 |
| 2 | 1990 | 50-69 | 559.570631 |
| 3 | 1990 | 70 or older | 356.248023 |
| 4 | 1990 | Under 5 | 17260.153428 |
| ... | ... | ... | ... |
| 130 | 2016 | 15-49 | 1535.348875 |
| 131 | 2016 | 5-14 | 2081.789766 |
| 132 | 2016 | 50-69 | 996.492470 |
| 133 | 2016 | 70 or older | 678.294092 |
| 134 | 2016 | Under 5 | 20555.300119 |
135 rows × 3 columns
def plot3(df):
fig = px.line(df, x="year", y="deaths",
color="age_group",
title='Average Malaria Deaths by Age Group in Western Africa from Years ' + str(df["year"].min()) + ' to ' + str(df["year"].max()),
labels={'deaths':'Average Deaths'},
hover_data={
'deaths':':,.2f'
}
)
fig.update_layout(
xaxis = dict(
tickmode = 'array',
tickvals = list(df["year"].unique()),
ticktext = list(range(df["year"].min(),df["year"].max()+1,1))
)
)
fig.update_traces(mode="markers+lines")
fig.update_xaxes(showspikes=True)
fig.update_yaxes(showspikes=True)
fig.update_layout(
xaxis=dict(
rangeslider=dict(
visible=True
)
)
)
return fig
plot3(df_plot3).show()